Load the necessary libraries
library(tidyverse) #for data wrangling
Link to the data transformation cheatsheet
https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf
Important data manipulation functions:
| Task | Function | Package |
|---|---|---|
| Sorting | arrange() |
dplyr |
| Adding columns | mutate() |
dplyr |
| Transformations | mutate() |
dplyr |
| Re-ordering factor levels | factor(,levels=) |
base |
| Re-labelling | factor(,lab=) |
base |
recode() |
dplyr | |
| Re-naming columns | rename(,replace=) |
dplyr |
| Filtering/Subsetting | indexing | base |
| ~ columns | select(,...) |
dplyr |
pull(,...) |
dplyr | |
| ~ rows | filter(,...) |
dplyr |
| Unique combinations | distinct() |
dplyr |
| Reshaping data | pivot_longer(), pivot_wider() |
tidyr |
| Split/combine columns | separate(), unite() |
tidyr |
| Aggregating | group_by() summarise() |
dplyr |
group_by() count() |
dplyr | |
| Merging/joining | *_join() |
dplyr |
| Extracting data structure | expand() |
tidyr |
crossing() |
tidyr |
load(file='../data/manipulationDatasets.RData')
dat.1 %>% head
# Arrange by the Treatment, then Time
dat.1 %>%
arrange(Treatment, Time)
# Arrange by Treatment, then by the mean of Resp 1 and Resp2
dat.1 %>%
mutate(Resp = mean(c(Resp1, Resp2))) %>%
arrange(Treatment, Resp)
dat.1 %>%
mutate(
Dose = fct_relevel(Dose, c('L', 'M', 'H')),
Dose = fct_recode(Dose, High = "H", Medium = "M")) %>%
as_tibble %>%
pull(Dose)
## [1] High High High High Medium Medium Medium Medium L L
## [11] L L High High High High Medium Medium Medium Medium
## [21] L L L L High High High High Medium Medium
## [31] Medium Medium L L L L High High High High
## [41] Medium Medium Medium Medium L L L L
## Levels: L Medium High
dat.1 %>%
mutate(leadResp2 = lead(Resp2),
lagResp1 = lag(Resp1))
dat.1 %>%
mutate(rank_time = min_rank(Time),
dens_rank_time = dense_rank(Time))
dat.1 %>%
mutate(row_Resp1 = row_number(Resp1),
row_time = row_number(Time),
rank_time = min_rank(Time),
ntile_Resp1 = ntile(Resp1, 4),
medium_Resp1 = between(Resp1, 20, 40),
fResp1A = case_when(Resp1 < 31 ~ "Low",
between(Resp1, 31, 50) ~ "Medium",
Resp1 > 50 ~ "High"),
fResp1B = cut(Resp1, breaks = c(0, 31, 50, 200),
labels = c("Low", "Medium", "High")),
fResp1C = cut(Resp1, breaks = 2,
labels = c("Low", "High")))
SE <- function(x) sd(x) / sqrt(length(x))
dat.1 %>%
summarise(MeanResp1 = mean(Resp1), VarResp1 = var(Resp1),
SEM = SE(Resp1))
# Using anonymous function:
dat.1 %>% summarise(MeanResp1 = mean(Resp1), VarResp1 = var(Resp1),
SEM = (function(x) sd(x) / sqrt(length(x)))(Resp1))
# Summarize across a number of variables
dat.1 %>% summarise(across(c(Resp1, Resp2),
list(Mean = mean, Var = var)))
# Get mean and var
dat.1 %>% summarise(across(where(is.numeric),
list(Mean = mean, Var = var)))
# Get mean and length
dat.1 %>% summarize(across(where(is.numeric), mean),
across(where(is.factor), length))
Variable <- c("Resp1", "Resp2")
dat.1 %>% summarise(across(all_of(Variable), list(Mean = mean, Var = var)))
dat.1 %>%
arrange(Resp1) %>%
group_by(Treatment, Plot) %>%
summarise(Mean = mean(Resp1),
Var = var(Resp1),
N = n(),
First = first(Resp1))
## `summarise()` regrouping output by 'Treatment' (override with `.groups` argument)
tikus[1:10, c(1:3, 76:77)]
tikus %>%
# head(10) %>%
# select("Psammocora contigua", "Psammocora digitata", "Pocillopora damicornis", "time", "rep") %>%
arrange(`Pocillopora damicornis`) %>%
group_by(time) %>%
summarise(across(all_of(c("Psammocora contigua", "Psammocora digitata", "Pocillopora damicornis")), list(Mean = mean))) %>%
ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
dat.1 %>% dplyr::select(-starts_with(c("T","D")))
dat.1 %>% dplyr::select(contains("e"))
dat.1 %>% dplyr::select(where(is_numeric))
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
## Warning: Deprecated
dat.1 %>% select(matches("^.{4}$")) # regex for get only columns with 4-characters
nasa %>%
select(lat, long, starts_with("cloud"))
nasa %>%
select(lat, long, matches("^cloud.*$")) %>% head
# Select rep, time, but not any with pora
tikus %>% select(rep, time, !contains("pora")) %>%
ncol()
## [1] 40
# Same thing, but older way of doing it
tikus %>% select(rep, time, everything(), -contains("pora")) %>%
ncol()
## [1] 40
dat.1 %>% rename(Exposure = Treatment, Richness = Resp1) %>%
relocate(Richness, .before=Plot)
https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf
dat.1 %>% filter(Dose == "H" & Resp1 < 25) %>% nrow
## [1] 6
dat.1 %>% filter(Dose == "H" | Resp1 < 25) %>% nrow
## [1] 18
Be careful about not being ambiguous!
dat.1 %>%
filter(Resp1 < 40 & (Time > 1 | Dose == "L")) %>% nrow
## [1] 10
dat.1 %>%
filter((Resp1 < 40 & Time > 1) | Dose == "L") %>% nrow
## [1] 24
nasa %>%
dplyr::filter(month == 2, year == max(year)) %>%
filter(ozone == max(ozone))
nasa %>%
dplyr::filter(month == 2 & year == max(year)) %>%
arrange(-ozone) %>% slice(1)
nasa %>% filter(month == 1, year == max(year)) %>%
filter(ozone > 320 & ozone < 325)
nasa %>%
filter(month == 1, year == max(year), between(ozone, 320, 325))
Always make sure to drop ‘ghost’ levels
dat.1 %>%
filter(Plot == "P1") %>%
pull(Plot) %>%
levels
## [1] "P1" "P2" "P3" "P4"
dat.1 %>%
filter(Plot == "P1") %>%
pull(Plot) %>%
droplevels()
## [1] P1 P1 P1 P1 P1 P1 P1 P1 P1 P1 P1 P1
## Levels: P1
data.w %>%
pivot_longer(cols = starts_with("Time"),
names_to = "time", values_to = "min") %>%
mutate(time = as.numeric(str_extract(time, "(?<=Time.).")))
# easier way:
data.w %>%
pivot_longer(cols = starts_with("Time"),
names_to = "time", values_to = "min",
names_prefix = "Time.") %>% # makes it way easier!!
mutate(time = as.numeric(time))
data %>%
pivot_wider(names_from = Within, values_from = c(Resp1, Resp2))
data.bio <- data.bio %>% relocate(Resp1, Resp2, .after=Subplot)
data.bio %>% full_join(data.chem) %>%
arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% left_join(data.chem) %>%
arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% right_join(data.chem) %>%
arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% inner_join(data.chem) %>%
arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% rbind(data.bio[6:9,]) %>%
left_join(data.chem) %>%
arrange(Plot, Subplot) # same as vlookup!
## Joining, by = c("Between", "Plot", "Subplot")
summary_tikus <- tikus %>%
rename(`Acropora aspera` = `Acropera aspera`) %>%
select(time, rep, starts_with("Acropora")) %>%
pivot_longer(cols = c(-rep, -time), names_to = "species", values_to = "abundance") %>%
mutate(cover = abundance / (10*100),
per_cover = cover * 100) %>%
group_by(time, rep) %>%
summarise(total_acropora_cover = sum(cover)) %>%
ungroup() %>%
group_by(time) %>% # averaging across all sites now
summarise(mean_yearly_acropora_cover = mean(total_acropora_cover),
sd_yearly_acropora_cover = sd(total_acropora_cover)) %>%
ungroup() %>%
mutate(year = 1900 + as.numeric(as.character(time)))
## `summarise()` regrouping output by 'time' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
summary_tikus <-
tikus %>%
rename(`Acropora aspera` = `Acropera aspera`) %>%
# select(time, rep, starts_with("Acropora")) %>%
pivot_longer(cols = c(-rep, -time), names_to = "species", values_to = "abundance") %>%
mutate(cover = abundance / (10*100),
per_cover = cover * 100) %>%
separate(col = species, c("genus", "species")) %>%
group_by(genus, time, rep) %>%
summarise(total_cover = sum(cover)) %>%
ungroup() %>%
mutate(genus = fct_lump_n(genus, 5, total_cover)) %>%
group_by(genus, time) %>% # averaging across all sites now
summarise(mean_yearly_cover = mean(total_cover),
sd_yearly_cover = sd(total_cover)) %>%
ungroup() %>%
mutate(year = 1900 + as.numeric(as.character(time)))
## `summarise()` regrouping output by 'genus', 'time' (override with `.groups` argument)
## `summarise()` regrouping output by 'genus' (override with `.groups` argument)
theme_set(theme_classic())
summary_tikus %>%
ggplot(aes(x = year, y = mean_yearly_cover, group = genus)) +
facet_wrap(~genus) +
geom_line(linetype = "dashed") +
geom_area(aes(fill = genus), alpha = 0.5) +
labs(x = "Year", y = "Mean Acropora Cover") +
scale_y_continuous(labels = scales::label_percent(),
expand = expansion()) +
scale_x_continuous(expand = expansion())